knitr::opts_knit$set(root.dir = "/home/francescoc/Desktop/scGraphVerse/data",
warning = FALSE,
message = FALSE,
error = FALSE)
Load Libraries
library(biomaRt)
library(DT)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::select() masks biomaRt::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(Seurat)
## Loading required package: SeuratObject
## Loading required package: sp
##
## Attaching package: 'SeuratObject'
##
## The following object is masked from 'package:DT':
##
## JS
##
## The following objects are masked from 'package:base':
##
## intersect, t
##
##
## Attaching package: 'Seurat'
##
## The following object is masked from 'package:DT':
##
## JS
library(STRINGdb)
library(scGraphVerse)
## Warning: replacing previous import 'igraph::components' by 'Seurat::components'
## when loading 'scGraphVerse'
Load Atlas Data
options(timeout = 600)
# PBMC Dataset: Local and systemic responses to SARS-CoV-2 infection in children and adults
seurat_url <- "https://datasets.cellxgene.cziscience.com/89619149-162f-4839-8e97-24735924417c.rds"
seurat_object <- download_Atlas(seurat_url)
seurat_subset <- subset(
x = seurat_object,
subset = disease == "normal" & cell_type == "CD4-positive helper T cell" & donor_id %in% c("AN6", "AN9", "NP18")
)
meta_features <- seurat_subset[["RNA"]]@meta.features
stopifnot(all(rownames(meta_features) == rownames(seurat_subset[["RNA"]]@data)))
new_rownames <- meta_features$name
new_rownames <- seurat_subset@assays$RNA@meta.features$name
if (length(new_rownames) != nrow(seurat_subset[["RNA"]]@data)) {
stop("The length of new_rownames does not match the number of features.")
}
assay <- seurat_subset[["RNA"]] # Access the RNA assay
rownames(assay@counts) <- new_rownames
rownames(assay@data) <- new_rownames
rownames(assay@meta.features) <- new_rownames
seurat_subset[["RNA"]] <- assay
Filtering step
seurat_subset <- PercentageFeatureSet(seurat_subset, "^MT-", col.name = "percent_mito")
# Ribosomal
seurat_subset <- PercentageFeatureSet(seurat_subset, "^RP[SL]", col.name = "percent_ribo")
feats <- c("nFeature_RNA", "nCount_RNA", "percent_mito", "percent_ribo")
VlnPlot(seurat_subset, group.by = "donor_id", split.by = "donor_id", features = feats, pt.size = 0.1, ncol = 2)
## The default behaviour of split.by has changed.
## Separate violin plots are now plotted side-by-side.
## To restore the old behaviour of a single split violin,
## set split.plot = TRUE.
##
## This message will be shown once per session.

FeatureScatter(seurat_subset, "nCount_RNA", "nFeature_RNA", group.by = "donor_id", pt.size = .5)

selected_c <- WhichCells(seurat_subset, expression = nFeature_RNA > 200)
selected_f <- rownames(seurat_subset)[Matrix::rowSums(seurat_subset) > 3]
seurat_subset_filt <- subset(seurat_subset, features = selected_f, cells = selected_c)
dim(seurat_subset_filt)
## [1] 12342 1431
table(seurat_subset_filt$donor_id)
##
## AN1 AN2 AN3 AN5 AN6 AN7 AN9 AN11 AN12 AN13 AN14 AP1 AP2 AP3 AP4 AP5
## 0 0 0 0 495 0 435 0 0 0 0 0 0 0 0 0
## AP6 AP7 AP8 AP9 AP10 AP11 AP12 AP13 NP13 NP15 NP16 NP17 NP18 NP19 NP20 NP21
## 0 0 0 0 0 0 0 0 0 0 0 0 501 0 0 0
## NP22 NP23 NP24 NP26 NP27 NP28 NP30 NP31 NP32 NP35 NP36 NP37 NP38 NP39 NP41 NP44
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## PC2 PC5 PC6 PC9 PC10 PC11 PC17 PC18 PC19 PC21 PC24 PC26 PC27 PC29 PP1 PP2
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## PP3 PP4 PP5 PP6 PP8 PP9 PP11 PP13 PP14 PP15 PP16
## 0 0 0 0 0 0 0 0 0 0 0
C <- seurat_subset_filt@assays$RNA@counts
C@x <- C@x / rep.int(colSums(C), diff(C@p)) * 100
most_expressed <- order(Matrix::rowSums(C), decreasing = T)[20:1]
boxplot(as.matrix(t(C[most_expressed, ])),
cex = 0.1, las = 1, xlab = "Percent counts per cell",
col = (scales::hue_pal())(20)[20:1], horizontal = TRUE
)

selected_mito <- WhichCells(seurat_subset_filt, expression = percent_mito < 20)
selected_ribo <- WhichCells(seurat_subset_filt, expression = percent_ribo > 5)
# and subset the object to only keep those cells
seurat_subset_filt <- subset(seurat_subset_filt, cells = selected_mito)
seurat_subset_filt <- subset(seurat_subset_filt, cells = selected_ribo)
dim(seurat_subset_filt)
## [1] 12342 1431
feats <- c("nFeature_RNA", "nCount_RNA", "percent_mito", "percent_ribo")
VlnPlot(seurat_subset_filt, group.by = "donor_id", features = feats, pt.size = 0.1, ncol = 2) + NoLegend()

# Filter MALAT1
seurat_subset_filt <- seurat_subset_filt[!grepl("MALAT1", rownames(seurat_subset_filt)), ]
# Filter Mitocondrial
seurat_subset_filt <- seurat_subset_filt[!grepl("^MT-", rownames(seurat_subset_filt)), ]
# Filter Ribossomal gene (optional if that is a problem on your data)
seurat_subset_filt <- seurat_subset_filt[ ! grepl("^RP[SL]", rownames(seurat_subset_filt)), ]
dim(seurat_subset_filt)
## [1] 12233 1431
C <- seurat_subset_filt@assays$RNA@counts
C@x <- C@x / rep.int(colSums(C), diff(C@p)) * 100
most_expressed <- order(Matrix::rowSums(C), decreasing = T)[20:1]
boxplot(as.matrix(t(C[most_expressed, ])),
cex = 0.1, las = 1, xlab = "Percent counts per cell",
col = (scales::hue_pal())(20)[20:1], horizontal = TRUE
)

Adjacency matrix top1200
pathgenes <- pathg(seurat_object = seurat_subset_filt, cell_type = "CD4-positive helper T cell", top_n = 1200)
## Top 1200 genes selected based on mean expression.
options(timeout = 2000)
result <- stringdb_adjacency(
genes = pathgenes,
species = 9606,
required_score = 900,
keep_all_genes = F
)
## Initializing STRINGdb...
## Mapping genes to STRING IDs...
## Warning: we couldn't map to STRING 1% of your identifiers
## Mapped 1185 genes to STRING IDs.
## Retrieving **physical** interactions from STRING API...
## Found 1419 STRING physical interactions.
## Adjacency matrices constructed successfully.
wadjm <- result$weighted
adjm <- result$binary
common_names <- intersect(rownames(adjm), colnames(adjm))
adjm <- adjm[common_names, common_names, drop = FALSE]
sorted_names <- sort(rownames(adjm))
adjm <- adjm[sorted_names, sorted_names]
print(dim(wadjm))
## [1] 554 554
print(dim(adjm))
## [1] 554 554
write.table(adjm, "./../analysis/adjm_top1200.txt", sep = "\t", quote = FALSE, col.names = TRUE, row.names = TRUE)
write.table(wadjm, "./../analysis/wadjm_top1200.txt", sep = "\t", quote = FALSE, col.names = TRUE, row.names = TRUE)
wadjm %>%
datatable(extensions = 'Buttons',
options = list(
dom = 'Bfrtip',
buttons = c('csv', 'excel'),
scrollX = TRUE,
pageLength = 10),
caption = "BioGRID Adjacency Top1000 genes")
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
adjm %>%
datatable(extensions = 'Buttons',
options = list(
dom = 'Bfrtip',
buttons = c('csv', 'excel'),
scrollX = TRUE,
pageLength = 10),
caption = "BioGRID Adjacency Top1000 genes")
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
Split data for donor_id
genes_to_keep <- intersect(pathgenes, rownames(seurat_subset_filt))
seurat_subset_filt <- subset(seurat_subset_filt, features = genes_to_keep)
saveRDS(seurat_subset_filt, "PBMC.top1200.RDS")
ncells <- as.matrix(seurat_subset_filt@assays$RNA@counts)
ncells <- ncells[rownames(adjm),]
mean(colSums(ncells))
## [1] 832.0161
Create Ground Truth Network
gtruth <- igraph::graph_from_adjacency_matrix(adjm, mode = "undirected", diag = FALSE)
num_nodes <- igraph::vcount(gtruth)
num_edges <- igraph::ecount(gtruth)
set.seed(1234)
p1 <- ggraph::ggraph(gtruth, layout = "fr") +
ggraph::geom_edge_link(color = "gray", width = 0.5) + # Set edge color and width
ggraph::geom_node_point(color = "steelblue", size = 0.7) + # Set node color and size
labs(title = paste("Ground Truth\nNodes:", igraph::vcount(gtruth), "Edges:", igraph::ecount(gtruth))) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 14, face = "bold")
)
p1

ggsave("./../analysis/gtruth_top1200_s300.png", p1, width = 8, height = 6, dpi = 300, bg = "white")
Simulate n500 p677 matrices
ncell <- 200
nodes <- nrow(adjm)
set.seed(1130)
mu_values <- c(3, 6, 9)
theta_values <- c(1, 0.7, 0.5)
count_matrices <- lapply(1:3, function(i) {
set.seed(1130 + i)
mu_i <- mu_values[i]
theta_i <- theta_values[i]
count_matrix_i <- learn2count::simdata(n = ncell, p = nodes, B = adjm, family = "ZINB",
mu = mu_i, mu_noise = 1, theta = theta_i, pi = 0.2)
count_matrix_df <- as.data.frame(count_matrix_i)
colnames(count_matrix_df) <- colnames(adjm)
rownames(count_matrix_df) <- paste("cell", 1:nrow(count_matrix_df), sep = "")
return(count_matrix_df)
})
count_matrices[[1]] %>%
datatable(extensions = 'Buttons',
options = list(
dom = 'Bfrtip',
buttons = c('csv', 'excel'),
scrollX = TRUE,
pageLength = 10),
caption = "Simulated Matrix")
#' Generate multiple zero-inflated negative binomial count matrices with cell-specific depth
#'
#' Simulates zero-inflated negative binomial (ZINB) data using an adjacency matrix,
#' generating `kmat` different matrices with cell-specific sequencing depth.
#'
#' @param n Integer. Number of samples (cells).
#' @param p Integer. Number of variables (genes).
#' @param B Matrix. A symmetric adjacency matrix (binary: 0/1), with row and column names as gene names.
#' @param mu_range List of numeric vectors. Each element is a numeric vector of length 2 specifying `mu` range per matrix.
#' @param mu_noise Numeric vector of length `kmat`. Mean of the noise component for each matrix.
#' @param theta Numeric vector of length `kmat`. Dispersion parameter of the negative binomial distribution for each matrix.
#' @param pi Numeric vector of length `kmat`. Probability of excess zeros (0 < pi < 1) for each matrix.
#' @param kmat Integer. Number of matrices to generate.
#' @param depth_range Numeric vector of length 2 or NA. If provided, defines the range of total counts per cell (e.g., `c(500, 5000)`). Default: `NA` (no depth scaling).
#' @param seed Integer (optional). Random seed for reproducibility.
#'
#' @return A list of `kmat` numeric matrices, each of dimension `n x p`, where row names correspond to gene names from `B`.
#' @importFrom distributions3 rzinbinom
#' @export
zinb_simdata <- function(n, p, B, mu_range, mu_noise, theta, pi, kmat = 1, depth_range = NA, seed = NULL) {
if (!is.null(seed)) set.seed(seed)
stopifnot(is.numeric(n), n > 0, floor(n) == n)
stopifnot(is.numeric(p), p > 0, floor(p) == p)
stopifnot(is.matrix(B), nrow(B) == ncol(B))
stopifnot(all(B %in% c(0, 1)))
stopifnot(is.numeric(kmat), kmat > 0, floor(kmat) == kmat)
stopifnot(length(mu_range) == kmat, all(sapply(mu_range, function(x) length(x) == 2 && all(x > 0))))
stopifnot(length(mu_noise) == kmat, all(mu_noise >= 0))
stopifnot(length(theta) == kmat, all(theta > 0))
stopifnot(length(pi) == kmat, all(pi > 0 & pi < 1))
if (!is.na(depth_range[1])) {
stopifnot(is.numeric(depth_range), length(depth_range) == 2, all(depth_range > 0), depth_range[1] < depth_range[2])
}
gene_names <- rownames(B)
cellID <- paste0("cell_", seq_len(n))
B <- ifelse(B > 0, 1, 0)
edges <- which(B == 1, arr.ind = TRUE)
edges <- edges[edges[, 1] < edges[, 2], ]
A <- diag(1, nrow = p, ncol = p)
for (i in seq_len(nrow(edges))) {
tmp <- rep(0, p)
tmp[edges[i, ]] <- 1
A <- cbind(A, tmp)
}
B[edges] <- sample(seq(min(unlist(mu_range)), max(unlist(mu_range))), length(edges[, 1]), replace = TRUE)
B <- (B | t(B)) * 1
matrices <- vector("list", kmat)
for (k in seq_len(kmat)) {
mu <- runif(p, mu_range[[k]][1], mu_range[[k]][2])
sigma <- B
nonzero_sigma <- sigma[lower.tri(sigma) & sigma != 0]
Y_mu <- c(mu, nonzero_sigma)
Y <- matrix(rzinbinom(length(Y_mu) * n, mu = rep(Y_mu, each = n), theta = theta[k], pi = pi[k]),
nrow = length(Y_mu), ncol = n)
X <- A %*% Y
noise_matrix <- matrix(rzinbinom(n * p, mu = mu_noise[k], theta = 1, pi = pi[k]), nrow = p, ncol = n)
X <- X + noise_matrix
X <- t(X)
if (!is.null(gene_names)) colnames(X) <- gene_names
rownames(X) <- cellID
# Deep coverage range da aggiungere
if (!is.na(depth_range[1])) {
cell_depths <- runif(n, min = depth_range[1], max = depth_range[2])
row_sums <- rowSums(X)
row_sums[row_sums == 0] <- 1
X <- sweep(X, 1, row_sums, FUN = "/")
X[is.na(X)] <- 0
X <- sweep(X, 1, cell_depths, FUN = "*")
X <- round(X)
}
matrices[[k]] <- X
}
return(matrices)
}
library(distributions3)
##
## Attaching package: 'distributions3'
## The following object is masked from 'package:stats':
##
## Gamma
## The following object is masked from 'package:grDevices':
##
## pdf
simat <- zinb_simdata(
n = ncell,
p = nodes,
B = adjm,
mu_range = list(c(1, 4), c(1, 7), c(1, 10)),
mu_noise = c(1, 3, 5),
theta = c(1, 0.7, 0.5),
pi = c(0.2, 0.2, 0.2),
kmat = 3,
depth_range = c((3*nodes)-500, (3*nodes)+500)
)
saveRDS(simat, "./../analysis/sim_n200p500.RDS")
simat[[1]] %>%
datatable(extensions = 'Buttons',
options = list(
dom = 'Bfrtip',
buttons = c('csv', 'excel'),
scrollX = TRUE,
pageLength = 10),
caption = "Simulated Matrix 1")